Group-189
Keesari Shravya - 2020FC04582
Abhijith K.S. - 2020FC04193
Pranesh V - 2020FC04961
SMS Spam Collection Dataset
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
# Download the POS-tagger model used by nltk.pos_tag (cached locally after first run).
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk import RegexpParser
[nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] C:\Users\shrav\AppData\Roaming\nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date!
# Sentence/word tokenizer models used by word_tokenize and sent_tokenize below.
nltk.download('punkt')
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\shrav\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
True
# English stopword list used during the text-cleaning step.
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\shrav\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
a) Download the file and set it as a Dataframe.
# a) Load the SMS spam dataset into a DataFrame.
# Latin-1 encoding is required because the CSV is not valid UTF-8.
df = pd.read_csv("spam.csv", encoding='ISO-8859-1')
df.head()
| v1 | v2 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | |
|---|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | NaN | NaN | NaN |
| 1 | ham | Ok lar... Joking wif u oni... | NaN | NaN | NaN |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | NaN | NaN | NaN |
| 3 | ham | U dun say so early hor... U c already then say... | NaN | NaN | NaN |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | NaN | NaN | NaN |
# Drop the three unnamed spill-over columns, then give the
# remaining two columns meaningful names.
df = df.drop(df.columns[2:5], axis=1)
df.columns = ['target', 'sms']
df.head()
| target | sms | |
|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... |
| 1 | ham | Ok lar... Joking wif u oni... |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
| 3 | ham | U dun say so early hor... U c already then say... |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... |
# Per-message word-token count (punctuation counts as tokens too).
df['words'] = [len(nltk.word_tokenize(text)) for text in df['sms']]
print("Word Tokens:")
print(nltk.word_tokenize(df['sms'][0]))
Word Tokens: ['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...']
# Per-message sentence-token count.
df['sentences'] = [len(nltk.sent_tokenize(text)) for text in df['sms']]
print("Sentence Tokens:")
print(nltk.sent_tokenize(df['sms'][0]))
Sentence Tokens: ['Go until jurong point, crazy..', 'Available only in bugis n great world la e buffet... Cine there got amore wat...']
# Encode the class label numerically: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['temp'] = df['target'].map(label_codes)
df.head()
| target | sms | words | sentences | temp | |
|---|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 24 | 2 | 0 |
| 1 | ham | Ok lar... Joking wif u oni... | 8 | 2 | 0 |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 37 | 2 | 1 |
| 3 | ham | U dun say so early hor... U c already then say... | 13 | 1 | 0 |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 15 | 1 | 0 |
# Class balance: count ham vs spam messages and show them as a bar chart.
ham_count = len(df[df['target'] == 'ham'])
spam_count = len(df[df['target'] == 'spam'])
temp = pd.DataFrame({'Target': ['Ham', 'Spam'],
                     'Count': [ham_count, spam_count]})
temp = temp.sort_values(by=['Count'], ascending=False)
fig = px.bar(temp, x='Target', y='Count',
             color="Target", text_auto='', width=600,
             title="Count Plot")
fig.update_traces(textfont_size=12, textangle=0,
                  textposition="outside", cliponaxis=False)
fig.update_layout(title_x=0.5)
fig.show()
# Pair plot of the two engineered length features, coloured by class.
fig = px.scatter_matrix(
    df,
    dimensions=["words", "sentences"],
    color="target",
    labels={"target": "Target"},
    title="Pair Plot",
)
fig.update_layout(autosize=False, width=900, height=500, title_x=0.5)
fig.show()
b) Remove punctuation, special characters, and stopwords from the text in the ‘sms’ column, and convert the text to lower case.
# b) Clean each SMS: strip punctuation/special characters, lowercase,
#    and drop English stopwords; cleaned messages accumulate in `corpus`.
# Hoist loop invariants: compile the regex once and build the stopword
# set once. The original called stopwords.words('english') inside the
# comprehension, re-loading the list and doing an O(n) linear scan for
# every single word — accidentally quadratic.
non_letters = re.compile('[^a-zA-Z]')
stop_words = set(stopwords.words('english'))
corpus = []
for sms in df['sms']:
    # Replace every non-alphabetic character with a space.
    msg = non_letters.sub(' ', sms)
    # Lowercase and split into words.
    words = msg.lower().split()
    # Keep only non-stopword tokens (set lookup is O(1)).
    kept = [word for word in words if word not in stop_words]
    corpus.append(' '.join(kept))
for i in range(0, 5):
    print("Sentence-{}: {}".format(i + 1, corpus[i]))
Sentence-1: go jurong point crazy available bugis n great world la e buffet cine got amore wat Sentence-2: ok lar joking wif u oni Sentence-3: free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply Sentence-4: u dun say early hor u c already say Sentence-5: nah think goes usf lives around though
# Attach the cleaned text as a new 'Corpus' column.
df = df.assign(Corpus=corpus)
df.head()
| target | sms | words | sentences | temp | Corpus | |
|---|---|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 24 | 2 | 0 | go jurong point crazy available bugis n great ... |
| 1 | ham | Ok lar... Joking wif u oni... | 8 | 2 | 0 | ok lar joking wif u oni |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 37 | 2 | 1 | free entry wkly comp win fa cup final tkts st ... |
| 3 | ham | U dun say so early hor... U c already then say... | 13 | 1 | 0 | u dun say early hor u c already say |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 15 | 1 | 0 | nah think goes usf lives around though |
c) Create two objects X and y. Create a CountVectorizer object and split the data into training and testing sets. Train a MultinomialNB model and display the confusion matrix.
# c) Bag-of-words features: one column per vocabulary term, raw counts per SMS.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Corpus']).toarray()
Y = df['temp'].values
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
vectorizer.get_feature_names_out()
['aa', 'aah', 'aaniye', 'aaooooright', 'aathi', 'ab', 'abbey', 'abdomen', 'abeg', 'abel', 'aberdeen', 'abi', 'ability', 'abiola', 'abj', 'able', 'abnormally', 'aboutas', 'abroad', 'absence', 'absolutely', 'absolutly', 'abstract', 'abt', 'abta', 'aburo', 'abuse', 'abusers', 'ac', 'academic', 'acc', 'accent', 'accenture', 'accept', 'access', 'accessible', 'accidant', 'accident', 'accidentally', 'accommodation', 'accommodationvouchers', 'accomodate', 'accomodations', 'accordin', 'accordingly', 'account', 'accounting', 'accounts', 'accumulation', 'achan', 'ache', 'achieve', 'acid', 'acknowledgement', 'acl', 'acnt', 'aco', 'across', 'act', 'acted', 'actin', 'acting', 'action', 'activ', 'activate', 'active', 'activities', 'actor', 'actual', 'actually', 'ad', 'adam', 'add', 'addamsfa', 'added', 'addicted', 'addie', 'adding', 'address', 'adds', 'adewale', 'adi', 'adjustable', 'admin', 'administrator', 'admirer', 'admission', 'admit', 'adore', 'adoring', 'adp', 'adress', 'adrian', 'ads', 'adsense', 'adult', 'adults', 'advance', 'adventure', 'adventuring', 'advice', 'advise', 'advising', 'advisors', 'ae', 'aeronautics', 'aeroplane', 'afew', 'affair', 'affairs', 'affection', 'affectionate', 'affections', 'affidavit', 'afford', 'afghanistan', 'afraid', 'africa', 'african', 'aft', 'afternon', 'afternoon', 'afternoons', 'afterwards', 'aftr', 'ag', 'agalla', 'age', 'agency', 'agent', 'agents', 'ages', 'agidhane', 'aging', 'ago', 'agree', 'ah', 'aha', 'ahead', 'ahhh', 'ahhhh', 'ahmad', 'ahold', 'aid', 'aids', 'aig', 'aight', 'aint', 'air', 'airport', 'airtel', 'aiya', 'aiyah', 'aiyar', 'aiyo', 'aj', 'ajith', 'ak', 'aka', 'akon', 'al', 'alaikkum', 'alaipayuthe', 'albi', 'album', 'alcohol', 'aldrine', 'alert', 'alertfrom', 'alerts', 'aletter', 'alex', 'alfie', 'algarve', 'algebra', 'algorithms', 'ali', 'alian', 'alibi', 'alive', 'allah', 'allalo', 'allday', 'alle', 'allo', 'allow', 'allowed', 'allows', 'almost', 'alone', 'along', 'alot', 'already', 'alright', 'alrite', 'also', 
'alter', 'alternative', 'although', 'alto', 'aluable', 'alwa', 'always', 'alwys', 'amanda', 'amazing', 'ambitious', 'ambrith', 'american', 'ami', 'amigos', 'amk', 'amla', 'amma', 'ammae', 'ammo', 'among', 'amongst', 'amore', 'amount', 'amp', 'amplikater', 'amrca', 'amrita', 'ams', 'amt', 'amused', 'amy', 'ana', 'anal', 'analysis', 'anand', 'anderson', 'andre', 'andres', 'andrews', 'andros', 'angels', 'angry', 'animal', 'animation', 'anjie', 'anjola', 'anna', 'annie', 'anniversary', 'annoncement', 'announced', 'announcement', 'annoyin', 'annoying', 'anonymous', 'anot', 'another', 'ans', 'ansr', 'answer', 'answered', 'answerin', 'answering', 'answers', 'answr', 'antelope', 'antha', 'anthony', 'anti', 'antibiotic', 'anybody', 'anyhow', 'anymore', 'anyone', 'anyones', 'anyplaces', 'anythiing', 'anythin', 'anything', 'anythingtomorrow', 'anytime', 'anyway', 'anyways', 'anywhere', 'aom', 'apart', 'apartment', 'apes', 'apeshit', 'aphex', 'apnt', 'apo', 'apologetic', 'apologise', 'apologize', 'apology', 'app', 'apparently', 'appeal', 'appear', 'appendix', 'applausestore', 'apple', 'applebees', 'apples', 'application', 'apply', 'applyed', 'applying', 'appointment', 'appointments', 'appreciate', 'appreciated', 'approaches', 'approaching', 'appropriate', 'approve', 'approved', 'approx', 'apps', 'appt', 'appy', 'apr', 'april', 'aproach', 'apt', 'aptitude', 'aq', 'aquarius', 'ar', 'arab', 'arabian', 'arcade', 'archive', 'ard', 'area', 'arent', 'arestaurant', 'aretaking', 'areyouunique', 'argentina', 'argh', 'argue', 'arguing', 'argument', 'arguments', 'aries', 'arise', 'arises', 'arithmetic', 'arm', 'armand', 'armenia', 'arms', 'arng', 'arngd', 'arnt', 'around', 'aroundn', 'arr', 'arrange', 'arranging', 'arrested', 'arrival', 'arrive', 'arrived', 'arrow', 'arsenal', 'art', 'artists', 'arts', 'arty', 'arul', 'arun', 'asa', 'asap', 'asda', 'ashes', 'ashley', 'ashwini', 'asia', 'asian', 'asjesus', 'ask', 'askd', 'asked', 'askin', 'asking', 'asks', 'aslamalaikkum', 'asleep', 'asp', 
'aspects', 'ass', 'assessment', 'asshole', 'assistance', 'associate', 'asssssholeeee', 'assume', 'assumed', 'asthere', 'asthma', 'astne', 'astoundingly', 'astrology', 'astronomer', 'asus', 'asusual', 'ate', 'athletic', 'athome', 'atlanta', 'atlast', 'atleast', 'atm', 'atrocious', 'attach', 'attached', 'attack', 'attempt', 'atten', 'attend', 'attended', 'attending', 'attention', 'attitude', 'attraction', 'attractive', 'attracts', 'attributed', 'atural', 'auction', 'audiitions', 'audition', 'audrey', 'audrie', 'august', 'aunt', 'auntie', 'aunties', 'aunts', 'aunty', 'aust', 'australia', 'authorise', 'auto', 'autocorrect', 'av', 'ava', 'availa', 'available', 'avalarr', 'avatar', 'avble', 'ave', 'avenge', 'avent', 'avenue', 'avin', 'avo', 'avoid', 'avoiding', 'avoids', 'await', 'awaiting', 'awake', 'award', 'awarded', 'away', 'awesome', 'awkward', 'aww', 'awww', 'ax', 'axel', 'axis', 'ay', 'ayn', 'ayo', 'ba', 'baaaaaaaabe', 'baaaaabe', 'babe', 'babes', 'babies', 'baby', 'babygoodbye', 'babyjontet', 'babysit', 'babysitting', 'bac', 'back', 'backdoor', 'backwards', 'bad', 'badass', 'badly', 'badrith', 'bag', 'bags', 'bahamas', 'baig', 'bailiff', 'bajarangabali', 'bak', 'bakra', 'bakrid', 'balance', 'ball', 'baller', 'balloon', 'balls', 'bam', 'bambling', 'band', 'bandages', 'bang', 'bangb', 'bangbabes', 'bani', 'bank', 'banks', 'banned', 'banneduk', 'banter', 'bao', 'bar', 'barbie', 'barcelona', 'bare', 'barely', 'bari', 'barkleys', 'barmed', 'barolla', 'barred', 'barrel', 'barring', 'barry', 'bars', 'base', 'based', 'bash', 'basic', 'basically', 'basket', 'basketball', 'basq', 'bat', 'batch', 'batchlor', 'bath', 'bathe', 'bathing', 'bathroom', 'batsman', 'batt', 'battery', 'battle', 'bawling', 'bay', 'bb', 'bbc', 'bbd', 'bbdeluxe', 'bbq', 'bc', 'bcaz', 'bck', 'bcm', 'bcmsfwc', 'bcoz', 'bcum', 'bcums', 'bcz', 'bday', 'beach', 'beads', 'bear', 'bears', 'beatings', 'beauties', 'beautiful', 'beauty', 'bec', 'becaus', 'becausethey', 'become', 'becomes', 'becoz', 'becz', 
'bed', 'bedrm', 'bedroom', 'beeen', 'beehoon', 'beendropping', 'beer', 'beerage', 'beers', 'befor', 'beforehand', 'beg', 'beggar', 'begging', 'begin', 'begins', 'begun', 'behalf', 'behave', 'behind', 'bein', 'believe', 'belive', 'bell', 'bellearlier', 'belligerent', 'belly', 'belong', 'belongs', 'belovd', 'beloved', 'belt', 'ben', 'bend', 'beneath', 'beneficiary', 'benefits', 'bennys', 'bergkamp', 'beside', 'best', 'bet', 'beta', 'beth', 'betta', 'better', 'bettersn', 'bettr', 'beverage', 'bevies', 'beware', 'beyond', 'bf', 'bffs', 'bfore', 'bhaji', 'bhamb', 'bhaskar', 'bhayandar', 'bian', 'biatch', 'bid', 'bids', 'big', 'bigger', 'biggest', 'bike', 'bill', 'billed', 'billing', 'billion', 'bills', 'billy', 'bilo', 'bimbo', 'bin', 'biola', 'bird', 'birds', 'birla', 'biro', 'birth', 'birthdate', 'birthday', 'bishan', 'bit', 'bitch', 'bitching', 'bite', 'bites', 'bits', 'biz', 'bk', 'black', 'blackberry', 'blacko', 'blah', 'blake', 'blame', 'blank', 'blanked', 'blanket', 'blankets', 'blastin', 'bleak', 'bleh', 'bless', 'blessed', 'blessing', 'blessings', 'blimey', 'blind', 'block', 'blocked', 'blog', 'blogging', 'blogspot', 'bloke', 'blokes', 'blonde', 'bloo', 'blood', 'bloody', 'bloomberg', 'blow', 'blowing', 'blown', 'blu', 'blue', 'bluetooth', 'bluetoothhdset', 'bluff', 'blur', 'bluray', 'bmw', 'board', 'boat', 'boatin', 'bob', 'body', 'boggy', 'bognor', 'bold', 'bollox', 'boltblue', 'bomb', 'bone', 'bong', 'bonus', 'boo', 'boobs', 'book', 'booked', 'bookedthe', 'booking', 'bookmark', 'books', 'bookshelf', 'boooo', 'boost', 'booty', 'bootydelious', 'borderline', 'bored', 'borin', 'boring', 'born', 'borrow', 'boss', 'boston', 'bot', 'bother', 'bothering', 'bottle', 'bottom', 'bought', 'boundaries', 'bout', 'bowa', 'bowl', 'bowls', 'box', 'boy', 'boye', 'boyf', 'boyfriend', 'boys', 'boytoy', 'bp', 'bpo', 'brah', 'brain', 'braindance', 'brainless', 'brains', 'brainy', 'brand', 'brandy', 'bras', 'brats', 'braved', 'bray', 'brb', 'brdget', 'bread', 'breadstick', 
'break', 'breaker', 'breakfast', 'breakin', 'breaking', 'breaks', 'breath', 'breathe', 'breather', 'breathing', 'breeze', 'breezy', 'bremoved', 'bribe', 'bridal', 'bridge', 'bridgwater', 'brief', 'bright', 'brighten', 'brilliant', 'brilliantly', 'brin', 'bring', 'bringing', 'brings', 'brisk', 'brison', 'bristol', 'british', 'britney', 'bro', 'broad', 'broadband', 'broke', 'broken', 'brolly', 'bros', 'broth', 'brothas', 'brother', 'brothers', 'brought', 'brown', 'brownie', 'brownies', 'browse', 'browser', 'browsin', 'bruce', 'brum', 'bruv', 'bslvyl', 'bsn', 'bsnl', 'bstfrnd', 'bt', 'btw', 'btwn', 'bubbletext', 'bucks', 'bud', 'buddy', 'buddys', 'budget', 'buen', 'buff', 'buffet', 'buffy', 'bugis', 'build', 'building', 'built', 'bulbs', 'bull', 'bullshit', 'bunch', 'bundle', 'bunkers', 'buns', 'burden', 'burger', 'burgundy', 'burial', 'burn', 'burning', 'burns', 'burnt', 'burrito', 'bus', 'buses', 'busetop', 'business', 'busty', 'busy', 'butt', 'buttheres', 'butting', 'buttons', 'buy', 'buyer', 'buyers', 'buying', 'buzy', 'buzz', 'buzzzz', 'bw', 'bx', 'byatch', 'bye', 'cab', 'cabin', 'cable', 'cafe', 'cage', 'cake', 'cakes', 'cal', 'calculated', 'calculation', 'cali', 'calicut', 'california', 'call', 'callback', 'callcost', 'calld', 'called', 'caller', 'callers', 'callertune', 'callfreefone', 'callin', 'calling', 'callon', 'calls', 'calm', 'cam', 'camcorder', 'came', 'camera', 'camp', 'campus', 'camry', 'canada', 'canal', 'canary', 'cancel', 'canceled', 'cancelled', 'cancer', 'canlove', 'cann', 'canname', 'cannot', 'cant', 'cantdo', 'canteen', 'capacity', 'capital', 'cappuccino', 'caps', 'captain', 'captaining', 'car', 'card', 'cardiff', 'cardin', 'cards', 'care', 'careabout', 'cared', 'career', 'careers', 'careful', 'carefully', 'careless', 'cares', 'caring', 'carlie', 'carlin', 'carlos', 'carly', 'carolina', 'caroline', 'carpark', 'carry', 'carryin', 'cars', 'cartons', 'cartoon', 'case', 'cash', 'cashbin', 'cashed', 'cashto', 'casing', 'cast', 'casting', 'castor', 
'casualty', 'cat', 'catch', 'catches', 'catching', ...]
# 80/20 train-test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)
print("Size of X,y in train and test data")
print(f"X train: {x_train.shape} \nX test: {x_test.shape} \ny train: {y_train.shape} \ny test: {y_test.shape}")
Size of X,y in train and test data X train: (4457, 7546) X test: (1115, 7546) y train: (4457,) y test: (1115,)
# Fit a multinomial Naive Bayes classifier on the count features
# and predict on both splits for evaluation.
nb = MultinomialNB().fit(x_train, y_train)
yp_train = nb.predict(x_train)
yp_test = nb.predict(x_test)
Classification Report for Train & test data
# Train-set evaluation: confusion matrix, specificity, accuracy, F1.
cm_train = confusion_matrix(y_train, yp_train)
train_cmd = ConfusionMatrixDisplay(cm_train)
# Specificity = TN / (TN + FP), with ham (class 0) as the negative class.
tn, fp = cm_train[0, 0], cm_train[0, 1]
s1 = round(tn / (tn + fp), 4)
print("**************************************************************************")
print("Classification Report for TRAIN DATA\n")
print(classification_report(y_train, yp_train))
print("Specificity on Train Data: ", s1)
print("Accuracy on Train Data: ", round(accuracy_score(y_train, yp_train), 4))
print("F1 Score on Train Data: ", round(f1_score(y_train, yp_train), 4))
print("**************************************************************************")
**************************************************************************
Classification Report for TRAIN DATA
precision recall f1-score support
0 1.00 1.00 1.00 3860
1 0.97 0.98 0.97 597
accuracy 0.99 4457
macro avg 0.98 0.99 0.99 4457
weighted avg 0.99 0.99 0.99 4457
Specificity on Train Data: 0.9953
Accuracy on Train Data: 0.993
F1 Score on Train Data: 0.9741
**************************************************************************
# Test-set evaluation: confusion matrix, specificity, accuracy, F1.
cm_test = confusion_matrix(y_test, yp_test)
test_cmd = ConfusionMatrixDisplay(cm_test)
# Specificity = TN / (TN + FP), with ham (class 0) as the negative class.
tn, fp = cm_test[0, 0], cm_test[0, 1]
s2 = round(tn / (tn + fp), 4)
print("**************************************************************************")
print("\nClassification Report for TEST DATA\n")
print(classification_report(y_test, yp_test))
print("Specificity on Test Data: ", s2)
print("Accuracy on Test Data: ", round(accuracy_score(y_test, yp_test), 4))
print("F1 Score Test Data: ", round(f1_score(y_test, yp_test), 4))
print("**************************************************************************")
**************************************************************************
Classification Report for TEST DATA
precision recall f1-score support
0 0.99 0.98 0.98 965
1 0.86 0.94 0.90 150
accuracy 0.97 1115
macro avg 0.93 0.96 0.94 1115
weighted avg 0.97 0.97 0.97 1115
Specificity on Test Data: 0.9762
Accuracy on Test Data: 0.9713
F1 Score Test Data: 0.8981
**************************************************************************
Confusion Matrix for Train Data
# Render the train-set confusion-matrix display prepared earlier.
print("Confusion Matrix for Train Data")
train_cmd.plot()
Confusion Matrix for Train Data
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1b8cf39db80>
Confusion Matrix for Test Data
# Render the test-set confusion-matrix display prepared earlier.
print("Confusion Matrix for Test Data")
test_cmd.plot()
Confusion Matrix for Test Data
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1b8cf57cfa0>
# Final look at the DataFrame with all engineered columns.
df.head()
| target | sms | words | sentences | temp | Corpus | |
|---|---|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 24 | 2 | 0 | go jurong point crazy available bugis n great ... |
| 1 | ham | Ok lar... Joking wif u oni... | 8 | 2 | 0 | ok lar joking wif u oni |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 37 | 2 | 1 | free entry wkly comp win fa cup final tkts st ... |
| 3 | ham | U dun say so early hor... U c already then say... | 13 | 1 | 0 | u dun say early hor u c already say |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 15 | 1 | 0 | nah think goes usf lives around though |
d) Display the POS tagging on the first 4 rows of ‘sms’.
# d) POS-tag the cleaned text of the first four messages.
print("POS tagging on first 4 rows of 'sms'")
for idx, sentence in enumerate(df['Corpus'][:4], start=1):
    tags = pos_tag(sentence.split())
    print(f"Sentence {idx} : {tags} ")
POS tagging on first 4 rows of 'sms'
Sentence 1 : [('go', 'VB'), ('jurong', 'JJ'), ('point', 'NN'), ('crazy', 'NN'), ('available', 'JJ'), ('bugis', 'NN'), ('n', 'RB'), ('great', 'JJ'), ('world', 'NN'), ('la', 'NN'), ('e', 'VBP'), ('buffet', 'JJ'), ('cine', 'NN'), ('got', 'VBD'), ('amore', 'RB'), ('wat', 'JJ')]
Sentence 2 : [('ok', 'JJ'), ('lar', 'JJ'), ('joking', 'NN'), ('wif', 'NN'), ('u', 'JJ'), ('oni', 'NN')]
Sentence 3 : [('free', 'JJ'), ('entry', 'NN'), ('wkly', 'VBD'), ('comp', 'NN'), ('win', 'NN'), ('fa', 'JJ'), ('cup', 'VBZ'), ('final', 'JJ'), ('tkts', 'NN'), ('st', 'NN'), ('may', 'MD'), ('text', 'VB'), ('fa', 'JJ'), ('receive', 'JJ'), ('entry', 'NN'), ('question', 'NN'), ('std', 'VBD'), ('txt', 'JJ'), ('rate', 'NN'), ('c', 'NNS'), ('apply', 'VBP')]
Sentence 4 : [('u', 'JJ'), ('dun', 'NNS'), ('say', 'VBP'), ('early', 'JJ'), ('hor', 'NN'), ('u', 'JJ'), ('c', 'NN'), ('already', 'RB'), ('say', 'VB')]
e) Build and display a dependency parser tree for the given sentence:
# e) Dependency-parse the sample sentence with spaCy's small English
#    model and render the tree inline in the notebook.
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
txt = nlp(u"the series opened 17 years later, as Viserys Targaryen tried to win an eastern tribal army to his side, so he could retake the Iron Throne")
displacy.render(txt, jupyter=True)
Showing POS tagging for each word in the given sentence
# Tabulate per-token POS, dependency label, head (text, index) and
# the indices of each token's syntactic children.
rows = [
    [token.text, token.pos_, token.dep_,
     (token.head.text, token.head.i), token.head.pos_,
     [child.i for child in token.children]]
    for token in txt
]
token_data_df = pd.DataFrame(
    rows, columns=['text', 'POS', 'dependency', 'head', 'head_POS', 'children'])
token_data_df.head()
| text | POS | dependency | head | head_POS | children | |
|---|---|---|---|---|---|---|
| 0 | the | DET | det | (series, 1) | NOUN | [] |
| 1 | series | NOUN | nsubj | (opened, 2) | VERB | [0] |
| 2 | opened | VERB | ccomp | (retake, 24) | VERB | [1, 5, 6, 10] |
| 3 | 17 | NUM | nummod | (years, 4) | NOUN | [] |
| 4 | years | NOUN | npadvmod | (later, 5) | ADV | [3] |